{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 5: Text Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch \n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "from torchtext import data \n",
    "import torchtext\n",
    "from pathlib import Path\n",
    "import pandas as pd\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading & Data Cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = \"cuda\"\n",
    "# You'll probably need to use the 'python' engine to load the CSV\n",
    "# tweetsDF = pd.read_csv(\"training.1600000.processed.noemoticon.csv\", header=None)\n",
    "tweetsDF = pd.read_csv(\"training.1600000.processed.noemoticon.csv\", \n",
    "engine=\"python\", header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tweetsDF[0].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tweetsDF[\"sentiment_cat\"] = tweetsDF[0].astype('category')\n",
    "tweetsDF[\"sentiment\"] = tweetsDF[\"sentiment_cat\"].cat.codes\n",
    "tweetsDF.to_csv(\"train-processed.csv\", header=None, index=None)      \n",
    "tweetsDF.sample(10000).to_csv(\"train-processed-sample.csv\", header=None, index=None) \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "LABEL = data.LabelField()\n",
    "TWEET = data.Field(tokenize='spacy', lower=true)\n",
    "\n",
    "fields = [('score',None), ('id',None),('date',None),('query',None),\n",
    "      ('name',None),\n",
    "      ('tweet', TWEET),('category',None),('label',LABEL)] "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create our Dataset and DataLoaders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "twitterDataset = torchtext.data.TabularDataset(\n",
    "        path=\"training-processed.csv\", \n",
    "        format=\"CSV\", \n",
    "        fields=fields,\n",
    "        skip_header=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(train, test, valid) = twitterDataset.split(split_ratio=[0.8,0.1,0.1])\n",
    "\n",
    "(len(train),len(test),len(valid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_size = 20000\n",
    "TWEET.build_vocab(train, max_size = vocab_size)\n",
    "TWEET.vocab.freqs.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(\n",
    "(train, valid, test), \n",
    "batch_size = 32,\n",
    "device = device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Our First LSTM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class OurFirstLSTM(nn.Module):\n",
    "    def __init__(self, hidden_size, embedding_dim, vocab_size):\n",
    "        super(OurFirstLSTM, self).__init__()\n",
    "    \n",
    "        self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
    "        self.encoder = nn.LSTM(input_size=embedding_dim,  \n",
    "                hidden_size=hidden_size, num_layers=1)\n",
    "        self.predictor = nn.Linear(hidden_size, 2)\n",
    "\n",
    "    def forward(self, seq):\n",
    "        output, (hidden,_) = self.encoder(self.embedding(seq))\n",
    "        preds = self.predictor(hidden.squeeze(0))\n",
    "        return preds\n",
    "\n",
    "model = OurFirstLSTM(100,300, 20002)\n",
    "model.to(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = optim.Adam(model.parameters(), lr=2e-2)\n",
    "criterion = nn.CrossEntropyLoss()\n",
    "\n",
    "def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):\n",
    "    for epoch in range(1, epochs + 1):\n",
    "     \n",
    "        training_loss = 0.0\n",
    "        valid_loss = 0.0\n",
    "        model.train()\n",
    "        for batch_idx, batch in enumerate(train_iterator):\n",
    "            opt.zero_grad()\n",
    "            predict = model(batch.tweet)\n",
    "            loss = criterion(predict,batch.label)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            training_loss += loss.data.item() * batch.tweet.size(0)\n",
    "        training_loss /= len(train_iterator)\n",
    " \n",
    "        \n",
    "        model.eval()\n",
    "        for batch_idx,batch in enumerate(valid_iterator):\n",
    "            predict = model(batch.tweet)\n",
    "            loss = criterion(predict,batch.label)\n",
    "            valid_loss += loss.data.item() * x.size(0)\n",
    " \n",
    "        valid_loss /= len(valid_iterator)\n",
    "        print('Epoch: {}, Training Loss: {:.2f}, \n",
    "        Validation Loss: {:.2f}'.format(epoch, training_loss, valid_loss))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Making predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def classify_tweet(tweet):\n",
    "    categories = {0: \"Negative\", 1:\"Positive\"}\n",
    "    processed = TWEET.process([TWEET.preprocess(tweet)])\n",
    "    return categories[model(processed).argmax().item()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data Augmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def random_deletion(words, p=0.5):\n",
    "    if len(words) == 1:\n",
    "        return words\n",
    "    remaining = list(filter(lambda x: random.uniform(0,1) > p,words))\n",
    "    if len(remaining) == 0:\n",
    "        return [random.choice(words)]\n",
    "    else\n",
    "        return remaining"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def random_swap(sentence, n=5):\n",
    "    length = range(len(sentence))\n",
    "    for _ in range(n):\n",
    "        idx1, idx2 = random.sample(length, 2)\n",
    "        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]\n",
    "    return sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Note: you'll have to define remove_stopwords() and get_synonyms() elsewhere\n",
    "\n",
    "def random_insertion(sentence,n):\n",
    "    words = remove_stopwords(sentence)\n",
    "    for _ in range(n):\n",
    "        new_synonym = get_synonyms(random.choice(words))\n",
    "        sentence.insert(randrange(len(sentence)+1), new_synonym)\n",
    "    return sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import googletrans\n",
    "import random\n",
    "\n",
    "translator = googletrans.Translator()\n",
    "\n",
    "sentences = ['The cat sat on the mat']\n",
    "\n",
    "translations_fr = translator.translate(sentences, dest='fr')\n",
    "fr_text = [t.text for t in translations_fr] \n",
    "translations_en = translator.translate(fr_text, dest='en')\n",
    "en_text = [t.text for t in translations_en]\n",
    "print(en_text)   \n",
    "\n",
    "available_langs = list(googletrans.LANGUAGES.keys())\n",
    "tr_lang = random.choice(available_langs)\n",
    "print(f\"Translating to {googletrans.LANGUAGES[tr_lang]}\")\n",
    "\n",
    "translations = translator.translate(sentences, dest=tr_lang)\n",
    "t_text = [t.text for t in translations]\n",
    "print(t_text)\n",
    "\n",
    "translations_en_random = translator.translate(t_text, src=tr_lang, dest='en')\n",
    "en_text = [t.text for t in translations_en_random]\n",
    "print(en_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
